1   /*
2    * Licensed to the Apache Software Foundation (ASF) under one or more
3    * contributor license agreements.  See the NOTICE file distributed with
4    * this work for additional information regarding copyright ownership.
5    * The ASF licenses this file to You under the Apache License, Version 2.0
6    * (the "License"); you may not use this file except in compliance with
7    * the License.  You may obtain a copy of the License at
8    *
9    *     http://www.apache.org/licenses/LICENSE-2.0
10   *
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   */
17  
18  /*
19  This file was partially derived from the
20  original CIIR University of Massachusetts Amherst version of KStemmer.java (license for
21  the original shown below)
22   */
23  
24  /*
25   Copyright © 2003,
26   Center for Intelligent Information Retrieval,
27   University of Massachusetts, Amherst.
28   All rights reserved.
29  
30   Redistribution and use in source and binary forms, with or without modification,
31   are permitted provided that the following conditions are met:
32  
33   1. Redistributions of source code must retain the above copyright notice, this
34   list of conditions and the following disclaimer.
35  
36   2. Redistributions in binary form must reproduce the above copyright notice,
37   this list of conditions and the following disclaimer in the documentation
38   and/or other materials provided with the distribution.
39  
40   3. The names "Center for Intelligent Information Retrieval" and
41   "University of Massachusetts" must not be used to endorse or promote products
42   derived from this software without prior written permission. To obtain
43   permission, contact info@ciir.cs.umass.edu.
44  
45   THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS
46   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
47   THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48   ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
49   LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
50   CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
51   GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52   HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53   LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54   OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
55   SUCH DAMAGE.
56   */
57  package org.apache.lucene.analysis.en;
58  
59  import org.apache.lucene.analysis.util.CharArrayMap;
60  import org.apache.lucene.analysis.util.OpenStringBuilder;
/**
 * This class implements the Kstem algorithm.
 *
 * <p>Title: Kstemmer</p>
 * <p>Description: This is a Java version of Bob Krovetz' kstem stemmer</p>
 * <p>Copyright: Copyright 2008, Lucid Imagination, Inc. </p>
 * <p>Copyright: Copyright 2003, CIIR University of Massachusetts Amherst (http://ciir.cs.umass.edu) </p>
 */
71  public class KStemmer {
72    static private final int MaxWordLen = 50;
73    
74    static private final String[] exceptionWords = {"aide", "bathe", "caste",
75        "cute", "dame", "dime", "doge", "done", "dune", "envelope", "gage",
76        "grille", "grippe", "lobe", "mane", "mare", "nape", "node", "pane",
77        "pate", "plane", "pope", "programme", "quite", "ripe", "rote", "rune",
78        "sage", "severe", "shoppe", "sine", "slime", "snipe", "steppe", "suite",
79        "swinge", "tare", "tine", "tope", "tripe", "twine"};
80    
81    static private final String[][] directConflations = { {"aging", "age"},
82        {"going", "go"}, {"goes", "go"}, {"lying", "lie"}, {"using", "use"},
83        {"owing", "owe"}, {"suing", "sue"}, {"dying", "die"}, {"tying", "tie"},
84        {"vying", "vie"}, {"aged", "age"}, {"used", "use"}, {"vied", "vie"},
85        {"cued", "cue"}, {"died", "die"}, {"eyed", "eye"}, {"hued", "hue"},
86        {"iced", "ice"}, {"lied", "lie"}, {"owed", "owe"}, {"sued", "sue"},
87        {"toed", "toe"}, {"tied", "tie"}, {"does", "do"}, {"doing", "do"},
88        {"aeronautical", "aeronautics"}, {"mathematical", "mathematics"},
89        {"political", "politics"}, {"metaphysical", "metaphysics"},
90        {"cylindrical", "cylinder"}, {"nazism", "nazi"},
91        {"ambiguity", "ambiguous"}, {"barbarity", "barbarous"},
92        {"credulity", "credulous"}, {"generosity", "generous"},
93        {"spontaneity", "spontaneous"}, {"unanimity", "unanimous"},
94        {"voracity", "voracious"}, {"fled", "flee"}, {"miscarriage", "miscarry"}};
95    
96    static private final String[][] countryNationality = {
97        {"afghan", "afghanistan"}, {"african", "africa"},
98        {"albanian", "albania"}, {"algerian", "algeria"},
99        {"american", "america"}, {"andorran", "andorra"}, {"angolan", "angola"},
100       {"arabian", "arabia"}, {"argentine", "argentina"},
101       {"armenian", "armenia"}, {"asian", "asia"}, {"australian", "australia"},
102       {"austrian", "austria"}, {"azerbaijani", "azerbaijan"},
103       {"azeri", "azerbaijan"}, {"bangladeshi", "bangladesh"},
104       {"belgian", "belgium"}, {"bermudan", "bermuda"}, {"bolivian", "bolivia"},
105       {"bosnian", "bosnia"}, {"botswanan", "botswana"},
106       {"brazilian", "brazil"}, {"british", "britain"},
107       {"bulgarian", "bulgaria"}, {"burmese", "burma"},
108       {"californian", "california"}, {"cambodian", "cambodia"},
109       {"canadian", "canada"}, {"chadian", "chad"}, {"chilean", "chile"},
110       {"chinese", "china"}, {"colombian", "colombia"}, {"croat", "croatia"},
111       {"croatian", "croatia"}, {"cuban", "cuba"}, {"cypriot", "cyprus"},
112       {"czechoslovakian", "czechoslovakia"}, {"danish", "denmark"},
113       {"egyptian", "egypt"}, {"equadorian", "equador"},
114       {"eritrean", "eritrea"}, {"estonian", "estonia"},
115       {"ethiopian", "ethiopia"}, {"european", "europe"}, {"fijian", "fiji"},
116       {"filipino", "philippines"}, {"finnish", "finland"},
117       {"french", "france"}, {"gambian", "gambia"}, {"georgian", "georgia"},
118       {"german", "germany"}, {"ghanian", "ghana"}, {"greek", "greece"},
119       {"grenadan", "grenada"}, {"guamian", "guam"},
120       {"guatemalan", "guatemala"}, {"guinean", "guinea"},
121       {"guyanan", "guyana"}, {"haitian", "haiti"}, {"hawaiian", "hawaii"},
122       {"holland", "dutch"}, {"honduran", "honduras"}, {"hungarian", "hungary"},
123       {"icelandic", "iceland"}, {"indonesian", "indonesia"},
124       {"iranian", "iran"}, {"iraqi", "iraq"}, {"iraqui", "iraq"},
125       {"irish", "ireland"}, {"israeli", "israel"},
126       {"italian", "italy"},
127       {"jamaican", "jamaica"},
128       {"japanese", "japan"},
129       {"jordanian", "jordan"},
130       {"kampuchean", "cambodia"},
131       {"kenyan", "kenya"},
132       {"korean", "korea"},
133       {"kuwaiti", "kuwait"},
134       {"lankan", "lanka"},
135       {"laotian", "laos"},
136       {"latvian", "latvia"},
137       {"lebanese", "lebanon"},
138       {"liberian", "liberia"},
139       {"libyan", "libya"},
140       {"lithuanian", "lithuania"},
141       {"macedonian", "macedonia"},
142       {"madagascan", "madagascar"},
143       {"malaysian", "malaysia"},
144       {"maltese", "malta"},
145       {"mauritanian", "mauritania"},
146       {"mexican", "mexico"},
147       {"micronesian", "micronesia"},
148       {"moldovan", "moldova"},
149       {"monacan", "monaco"},
150       {"mongolian", "mongolia"},
151       {"montenegran", "montenegro"},
152       {"moroccan", "morocco"},
153       {"myanmar", "burma"},
154       {"namibian", "namibia"},
155       {"nepalese", "nepal"},
156       // {"netherlands", "dutch"},
157       {"nicaraguan", "nicaragua"}, {"nigerian", "nigeria"},
158       {"norwegian", "norway"}, {"omani", "oman"}, {"pakistani", "pakistan"},
159       {"panamanian", "panama"}, {"papuan", "papua"},
160       {"paraguayan", "paraguay"}, {"peruvian", "peru"},
161       {"portuguese", "portugal"}, {"romanian", "romania"},
162       {"rumania", "romania"}, {"rumanian", "romania"}, {"russian", "russia"},
163       {"rwandan", "rwanda"}, {"samoan", "samoa"}, {"scottish", "scotland"},
164       {"serb", "serbia"}, {"serbian", "serbia"}, {"siam", "thailand"},
165       {"siamese", "thailand"}, {"slovakia", "slovak"}, {"slovakian", "slovak"},
166       {"slovenian", "slovenia"}, {"somali", "somalia"},
167       {"somalian", "somalia"}, {"spanish", "spain"}, {"swedish", "sweden"},
168       {"swiss", "switzerland"}, {"syrian", "syria"}, {"taiwanese", "taiwan"},
169       {"tanzanian", "tanzania"}, {"texan", "texas"}, {"thai", "thailand"},
170       {"tunisian", "tunisia"}, {"turkish", "turkey"}, {"ugandan", "uganda"},
171       {"ukrainian", "ukraine"}, {"uruguayan", "uruguay"},
172       {"uzbek", "uzbekistan"}, {"venezuelan", "venezuela"},
173       {"vietnamese", "viet"}, {"virginian", "virginia"}, {"yemeni", "yemen"},
174       {"yugoslav", "yugoslavia"}, {"yugoslavian", "yugoslavia"},
175       {"zambian", "zambia"}, {"zealander", "zealand"},
176       {"zimbabwean", "zimbabwe"}};
177   
178   static private final String[] supplementDict = {"aids", "applicator",
179       "capacitor", "digitize", "electromagnet", "ellipsoid", "exosphere",
180       "extensible", "ferromagnet", "graphics", "hydromagnet", "polygraph",
181       "toroid", "superconduct", "backscatter", "connectionism"};
182   
183   static private final String[] properNouns = {"abrams", "achilles",
184       "acropolis", "adams", "agnes", "aires", "alexander", "alexis", "alfred",
185       "algiers", "alps", "amadeus", "ames", "amos", "andes", "angeles",
186       "annapolis", "antilles", "aquarius", "archimedes", "arkansas", "asher",
187       "ashly", "athens", "atkins", "atlantis", "avis", "bahamas", "bangor",
188       "barbados", "barger", "bering", "brahms", "brandeis", "brussels",
189       "bruxelles", "cairns", "camoros", "camus", "carlos", "celts", "chalker",
190       "charles", "cheops", "ching", "christmas", "cocos", "collins",
191       "columbus", "confucius", "conners", "connolly", "copernicus", "cramer",
192       "cyclops", "cygnus", "cyprus", "dallas", "damascus", "daniels", "davies",
193       "davis", "decker", "denning", "dennis", "descartes", "dickens", "doris",
194       "douglas", "downs", "dreyfus", "dukakis", "dulles", "dumfries",
195       "ecclesiastes", "edwards", "emily", "erasmus", "euphrates", "evans",
196       "everglades", "fairbanks", "federales", "fisher", "fitzsimmons",
197       "fleming", "forbes", "fowler", "france", "francis", "goering",
198       "goodling", "goths", "grenadines", "guiness", "hades", "harding",
199       "harris", "hastings", "hawkes", "hawking", "hayes", "heights",
200       "hercules", "himalayas", "hippocrates", "hobbs", "holmes", "honduras",
201       "hopkins", "hughes", "humphreys", "illinois", "indianapolis",
202       "inverness", "iris", "iroquois", "irving", "isaacs", "italy", "james",
203       "jarvis", "jeffreys", "jesus", "jones", "josephus", "judas", "julius",
204       "kansas", "keynes", "kipling", "kiwanis", "lansing", "laos", "leeds",
205       "levis", "leviticus", "lewis", "louis", "maccabees", "madras",
206       "maimonides", "maldive", "massachusetts", "matthews", "mauritius",
207       "memphis", "mercedes", "midas", "mingus", "minneapolis", "mohammed",
208       "moines", "morris", "moses", "myers", "myknos", "nablus", "nanjing",
209       "nantes", "naples", "neal", "netherlands", "nevis", "nostradamus",
210       "oedipus", "olympus", "orleans", "orly", "papas", "paris", "parker",
211       "pauling", "peking", "pershing", "peter", "peters", "philippines",
212       "phineas", "pisces", "pryor", "pythagoras", "queens", "rabelais",
213       "ramses", "reynolds", "rhesus", "rhodes", "richards", "robins",
214       "rodgers", "rogers", "rubens", "sagittarius", "seychelles", "socrates",
215       "texas", "thames", "thomas", "tiberias", "tunis", "venus", "vilnius",
216       "wales", "warner", "wilkins", "williams", "wyoming", "xmas", "yonkers",
217       "zeus", "frances", "aarhus", "adonis", "andrews", "angus", "antares",
218       "aquinas", "arcturus", "ares", "artemis", "augustus", "ayers",
219       "barnabas", "barnes", "becker", "bejing", "biggs", "billings", "boeing",
220       "boris", "borroughs", "briggs", "buenos", "calais", "caracas", "cassius",
221       "cerberus", "ceres", "cervantes", "chantilly", "chartres", "chester",
222       "connally", "conner", "coors", "cummings", "curtis", "daedalus",
223       "dionysus", "dobbs", "dolores", "edmonds"};
224   
225   static class DictEntry {
226     boolean exception;
227     String root;
228     
229     DictEntry(String root, boolean isException) {
230       this.root = root;
231       this.exception = isException;
232     }
233   }
234   
235   private static final CharArrayMap<DictEntry> dict_ht = initializeDictHash();
236   
237   /***
238    * caching off private int maxCacheSize; private CharArrayMap<String> cache =
239    * null; private static final String SAME = "SAME"; // use if stemmed form is
240    * the same
241    ***/
242   
243   private final OpenStringBuilder word = new OpenStringBuilder();
244   private int j; /* index of final letter in stem (within word) */
245   private int k; /*
246                   * INDEX of final letter in word. You must add 1 to k to get
247                   * the current length of word. When you want the length of
248                   * word, use the method wordLength, which returns (k+1).
249                   */
250   
251   /*
252    * private void initializeStemHash() { if (maxCacheSize > 0) cache = new
253    * CharArrayMap<String>(maxCacheSize,false); }
254    ***/
255   
256   private char finalChar() {
257     return word.charAt(k);
258   }
259   
260   private char penultChar() {
261     return word.charAt(k - 1);
262   }
263   
264   private boolean isVowel(int index) {
265     return !isCons(index);
266   }
267   
268   private boolean isCons(int index) {
269     char ch;
270     
271     ch = word.charAt(index);
272     
273     if ((ch == 'a') || (ch == 'e') || (ch == 'i') || (ch == 'o') || (ch == 'u')) return false;
274     if ((ch != 'y') || (index == 0)) return true;
275     else return (!isCons(index - 1));
276   }
277   
278   private static CharArrayMap<DictEntry> initializeDictHash() {
279     DictEntry defaultEntry;
280     DictEntry entry;
281 
282     CharArrayMap<DictEntry> d = new CharArrayMap<>(1000, false);
283     for (int i = 0; i < exceptionWords.length; i++) {
284       if (!d.containsKey(exceptionWords[i])) {
285         entry = new DictEntry(exceptionWords[i], true);
286         d.put(exceptionWords[i], entry);
287       } else {
288         throw new RuntimeException("Warning: Entry [" + exceptionWords[i]
289             + "] already in dictionary 1");
290       }
291     }
292     
293     for (int i = 0; i < directConflations.length; i++) {
294       if (!d.containsKey(directConflations[i][0])) {
295         entry = new DictEntry(directConflations[i][1], false);
296         d.put(directConflations[i][0], entry);
297       } else {
298         throw new RuntimeException("Warning: Entry [" + directConflations[i][0]
299             + "] already in dictionary 2");
300       }
301     }
302     
303     for (int i = 0; i < countryNationality.length; i++) {
304       if (!d.containsKey(countryNationality[i][0])) {
305         entry = new DictEntry(countryNationality[i][1], false);
306         d.put(countryNationality[i][0], entry);
307       } else {
308         throw new RuntimeException("Warning: Entry [" + countryNationality[i][0]
309             + "] already in dictionary 3");
310       }
311     }
312     
313     defaultEntry = new DictEntry(null, false);
314     
315     String[] array;
316     array = KStemData1.data;
317     
318     for (int i = 0; i < array.length; i++) {
319       if (!d.containsKey(array[i])) {
320         d.put(array[i], defaultEntry);
321       } else {
322         throw new RuntimeException("Warning: Entry [" + array[i]
323             + "] already in dictionary 4");
324       }
325     }
326     
327     array = KStemData2.data;
328     for (int i = 0; i < array.length; i++) {
329       if (!d.containsKey(array[i])) {
330         d.put(array[i], defaultEntry);
331       } else {
332         throw new RuntimeException("Warning: Entry [" + array[i]
333             + "] already in dictionary 4");
334       }
335     }
336     
337     array = KStemData3.data;
338     for (int i = 0; i < array.length; i++) {
339       if (!d.containsKey(array[i])) {
340         d.put(array[i], defaultEntry);
341       } else {
342         throw new RuntimeException("Warning: Entry [" + array[i]
343             + "] already in dictionary 4");
344       }
345     }
346     
347     array = KStemData4.data;
348     for (int i = 0; i < array.length; i++) {
349       if (!d.containsKey(array[i])) {
350         d.put(array[i], defaultEntry);
351       } else {
352         throw new RuntimeException("Warning: Entry [" + array[i]
353             + "] already in dictionary 4");
354       }
355     }
356     
357     array = KStemData5.data;
358     for (int i = 0; i < array.length; i++) {
359       if (!d.containsKey(array[i])) {
360         d.put(array[i], defaultEntry);
361       } else {
362         throw new RuntimeException("Warning: Entry [" + array[i]
363             + "] already in dictionary 4");
364       }
365     }
366     
367     array = KStemData6.data;
368     for (int i = 0; i < array.length; i++) {
369       if (!d.containsKey(array[i])) {
370         d.put(array[i], defaultEntry);
371       } else {
372         throw new RuntimeException("Warning: Entry [" + array[i]
373             + "] already in dictionary 4");
374       }
375     }
376     
377     array = KStemData7.data;
378     for (int i = 0; i < array.length; i++) {
379       if (!d.containsKey(array[i])) {
380         d.put(array[i], defaultEntry);
381       } else {
382         throw new RuntimeException("Warning: Entry [" + array[i]
383             + "] already in dictionary 4");
384       }
385     }
386     
387     for (int i = 0; i < KStemData8.data.length; i++) {
388       if (!d.containsKey(KStemData8.data[i])) {
389         d.put(KStemData8.data[i], defaultEntry);
390       } else {
391         throw new RuntimeException("Warning: Entry [" + KStemData8.data[i]
392             + "] already in dictionary 4");
393       }
394     }
395     
396     for (int i = 0; i < supplementDict.length; i++) {
397       if (!d.containsKey(supplementDict[i])) {
398         d.put(supplementDict[i], defaultEntry);
399       } else {
400         throw new RuntimeException("Warning: Entry [" + supplementDict[i]
401             + "] already in dictionary 5");
402       }
403     }
404     
405     for (int i = 0; i < properNouns.length; i++) {
406       if (!d.containsKey(properNouns[i])) {
407         d.put(properNouns[i], defaultEntry);
408       } else {
409         throw new RuntimeException("Warning: Entry [" + properNouns[i]
410             + "] already in dictionary 6");
411       }
412     }
413     
414     return d;
415   }
416   
417   private boolean isAlpha(char ch) {
418     return ch >= 'a' && ch <= 'z'; // terms must be lowercased already
419   }
420   
421   /* length of stem within word */
422   private int stemLength() {
423     return j + 1;
424   };
425   
426   private boolean endsIn(char[] s) {
427     if (s.length > k) return false;
428     
429     int r = word.length() - s.length; /* length of word before this suffix */
430     j = k;
431     for (int r1 = r, i = 0; i < s.length; i++, r1++) {
432       if (s[i] != word.charAt(r1)) return false;
433     }
434     j = r - 1; /* index of the character BEFORE the posfix */
435     return true;
436   }
437   
438   private boolean endsIn(char a, char b) {
439     if (2 > k) return false;
440     // check left to right since the endings have often already matched
441     if (word.charAt(k - 1) == a && word.charAt(k) == b) {
442       j = k - 2;
443       return true;
444     }
445     return false;
446   }
447   
448   private boolean endsIn(char a, char b, char c) {
449     if (3 > k) return false;
450     if (word.charAt(k - 2) == a && word.charAt(k - 1) == b
451         && word.charAt(k) == c) {
452       j = k - 3;
453       return true;
454     }
455     return false;
456   }
457   
458   private boolean endsIn(char a, char b, char c, char d) {
459     if (4 > k) return false;
460     if (word.charAt(k - 3) == a && word.charAt(k - 2) == b
461         && word.charAt(k - 1) == c && word.charAt(k) == d) {
462       j = k - 4;
463       return true;
464     }
465     return false;
466   }
467   
468   private DictEntry wordInDict() {
469     /***
470      * if (matchedEntry != null) { if (dict_ht.get(word.getArray(), 0,
471      * word.size()) != matchedEntry) {
472      * System.out.println("Uh oh... cached entry doesn't match"); } return
473      * matchedEntry; }
474      ***/
475     if (matchedEntry != null) return matchedEntry;
476     DictEntry e = dict_ht.get(word.getArray(), 0, word.length());
477     if (e != null && !e.exception) {
478       matchedEntry = e; // only cache if it's not an exception.
479     }
480     // lookups.add(word.toString());
481     return e;
482   }
483   
484   /* Convert plurals to singular form, and '-ies' to 'y' */
485   private void plural() {
486     if (word.charAt(k) == 's') {
487       if (endsIn('i', 'e', 's')) {
488         word.setLength(j + 3);
489         k--;
490         if (lookup()) /* ensure calories -> calorie */
491         return;
492         k++;
493         word.unsafeWrite('s');
494         setSuffix("y");
495         lookup();
496       } else if (endsIn('e', 's')) {
497         /* try just removing the "s" */
498         word.setLength(j + 2);
499         k--;
500         
501         /*
502          * note: don't check for exceptions here. So, `aides' -> `aide', but
503          * `aided' -> `aid'. The exception for double s is used to prevent
504          * crosses -> crosse. This is actually correct if crosses is a plural
505          * noun (a type of racket used in lacrosse), but the verb is much more
506          * common
507          */
508 
509         /****
510          * YCS: this was the one place where lookup was not followed by return.
511          * So restructure it. if ((j>0)&&(lookup(word.toString())) &&
512          * !((word.charAt(j) == 's') && (word.charAt(j-1) == 's'))) return;
513          *****/
514         boolean tryE = j > 0
515             && !((word.charAt(j) == 's') && (word.charAt(j - 1) == 's'));
516         if (tryE && lookup()) return;
517         
518         /* try removing the "es" */
519 
520         word.setLength(j + 1);
521         k--;
522         if (lookup()) return;
523         
524         /* the default is to retain the "e" */
525         word.unsafeWrite('e');
526         k++;
527         
528         if (!tryE) lookup(); // if we didn't try the "e" ending before
529         return;
530       } else {
531         if (word.length() > 3 && penultChar() != 's' && !endsIn('o', 'u', 's')) {
532           /* unless the word ends in "ous" or a double "s", remove the final "s" */
533 
534           word.setLength(k);
535           k--;
536           lookup();
537         }
538       }
539     }
540   }
541   
542   private void setSuffix(String s) {
543     setSuff(s, s.length());
544   }
545   
546   /* replace old suffix with s */
547   private void setSuff(String s, int len) {
548     word.setLength(j + 1);
549     for (int l = 0; l < len; l++) {
550       word.unsafeWrite(s.charAt(l));
551     }
552     k = j + len;
553   }
554   
555   /* Returns true if the word is found in the dictionary */
556   // almost all uses of lookup() return immediately and are
557   // followed by another lookup in the dict. Store the match
558   // to avoid this double lookup.
559   DictEntry matchedEntry = null;
560   
561   private boolean lookup() {
562     /******
563      * debugging code String thisLookup = word.toString(); boolean added =
564      * lookups.add(thisLookup); if (!added) {
565      * System.out.println("######extra lookup:" + thisLookup); // occaasional
566      * extra lookups aren't necessarily errors... could happen by diff
567      * manipulations // throw new RuntimeException("######extra lookup:" +
568      * thisLookup); } else { // System.out.println("new lookup:" + thisLookup);
569      * }
570      ******/
571     
572     matchedEntry = dict_ht.get(word.getArray(), 0, word.size());
573     return matchedEntry != null;
574   }
575   
576   // Set<String> lookups = new HashSet<>();
577   
578   /* convert past tense (-ed) to present, and `-ied' to `y' */
579   private void pastTense() {
580     /*
581      * Handle words less than 5 letters with a direct mapping This prevents
582      * (fled -> fl).
583      */
584     if (word.length() <= 4) return;
585     
586     if (endsIn('i', 'e', 'd')) {
587       word.setLength(j + 3);
588       k--;
589       if (lookup()) /* we almost always want to convert -ied to -y, but */
590       return; /* this isn't true for short words (died->die) */
591       k++; /* I don't know any long words that this applies to, */
592       word.unsafeWrite('d'); /* but just in case... */
593       setSuffix("y");
594       lookup();
595       return;
596     }
597     
598     /* the vowelInStem() is necessary so we don't stem acronyms */
599     if (endsIn('e', 'd') && vowelInStem()) {
600       /* see if the root ends in `e' */
601       word.setLength(j + 2);
602       k = j + 1;
603       
604       DictEntry entry = wordInDict();
605       if (entry != null) if (!entry.exception) /*
606                                                 * if it's in the dictionary and
607                                                 * not an exception
608                                                 */
609       return;
610       
611       /* try removing the "ed" */
612       word.setLength(j + 1);
613       k = j;
614       if (lookup()) return;
615       
616       /*
617        * try removing a doubled consonant. if the root isn't found in the
618        * dictionary, the default is to leave it doubled. This will correctly
619        * capture `backfilled' -> `backfill' instead of `backfill' ->
620        * `backfille', and seems correct most of the time
621        */
622 
623       if (doubleC(k)) {
624         word.setLength(k);
625         k--;
626         if (lookup()) return;
627         word.unsafeWrite(word.charAt(k));
628         k++;
629         lookup();
630         return;
631       }
632       
633       /* if we have a `un-' prefix, then leave the word alone */
634       /* (this will sometimes screw up with `under-', but we */
635       /* will take care of that later) */
636 
637       if ((word.charAt(0) == 'u') && (word.charAt(1) == 'n')) {
638         word.unsafeWrite('e');
639         word.unsafeWrite('d');
640         k = k + 2;
641         // nolookup()
642         return;
643       }
644       
645       /*
646        * it wasn't found by just removing the `d' or the `ed', so prefer to end
647        * with an `e' (e.g., `microcoded' -> `microcode').
648        */
649 
650       word.setLength(j + 1);
651       word.unsafeWrite('e');
652       k = j + 1;
653       // nolookup() - we already tried the "e" ending
654       return;
655     }
656   }
657   
658   /* return TRUE if word ends with a double consonant */
659   private boolean doubleC(int i) {
660     if (i < 1) return false;
661     
662     if (word.charAt(i) != word.charAt(i - 1)) return false;
663     return (isCons(i));
664   }
665   
666   private boolean vowelInStem() {
667     for (int i = 0; i < stemLength(); i++) {
668       if (isVowel(i)) return true;
669     }
670     return false;
671   }
672   
673   /* handle `-ing' endings */
674   private void aspect() {
675     /*
676      * handle short words (aging -> age) via a direct mapping. This prevents
677      * (thing -> the) in the version of this routine that ignores inflectional
678      * variants that are mentioned in the dictionary (when the root is also
679      * present)
680      */
681 
682     if (word.length() <= 5) return;
683     
684     /* the vowelinstem() is necessary so we don't stem acronyms */
685     if (endsIn('i', 'n', 'g') && vowelInStem()) {
686       
687       /* try adding an `e' to the stem and check against the dictionary */
688       word.setCharAt(j + 1, 'e');
689       word.setLength(j + 2);
690       k = j + 1;
691       
692       DictEntry entry = wordInDict();
693       if (entry != null) {
694         if (!entry.exception) /* if it's in the dictionary and not an exception */
695         return;
696       }
697       
698       /* adding on the `e' didn't work, so remove it */
699       word.setLength(k);
700       k--; /* note that `ing' has also been removed */
701       
702       if (lookup()) return;
703       
704       /* if I can remove a doubled consonant and get a word, then do so */
705       if (doubleC(k)) {
706         k--;
707         word.setLength(k + 1);
708         if (lookup()) return;
709         word.unsafeWrite(word.charAt(k)); /* restore the doubled consonant */
710         
711         /* the default is to leave the consonant doubled */
712         /* (e.g.,`fingerspelling' -> `fingerspell'). Unfortunately */
713         /* `bookselling' -> `booksell' and `mislabelling' -> `mislabell'). */
714         /* Without making the algorithm significantly more complicated, this */
715         /* is the best I can do */
716         k++;
717         lookup();
718         return;
719       }
720       
721       /*
722        * the word wasn't in the dictionary after removing the stem, and then
723        * checking with and without a final `e'. The default is to add an `e'
724        * unless the word ends in two consonants, so `microcoding' ->
725        * `microcode'. The two consonants restriction wouldn't normally be
726        * necessary, but is needed because we don't try to deal with prefixes and
727        * compounds, and most of the time it is correct (e.g., footstamping ->
728        * footstamp, not footstampe; however, decoupled -> decoupl). We can
729        * prevent almost all of the incorrect stems if we try to do some prefix
730        * analysis first
731        */
732 
733       if ((j > 0) && isCons(j) && isCons(j - 1)) {
734         k = j;
735         word.setLength(k + 1);
736         // nolookup() because we already did according to the comment
737         return;
738       }
739       
740       word.setLength(j + 1);
741       word.unsafeWrite('e');
742       k = j + 1;
743       // nolookup(); we already tried an 'e' ending
744       return;
745     }
746   }
747   
748   /*
749    * this routine deals with -ity endings. It accepts -ability, -ibility, and
750    * -ality, even without checking the dictionary because they are so
751    * productive. The first two are mapped to -ble, and the -ity is remove for
752    * the latter
753    */
754   private void ityEndings() {
755     int old_k = k;
756     
757     if (endsIn('i', 't', 'y')) {
758       word.setLength(j + 1); /* try just removing -ity */
759       k = j;
760       if (lookup()) return;
761       word.unsafeWrite('e'); /* try removing -ity and adding -e */
762       k = j + 1;
763       if (lookup()) return;
764       word.setCharAt(j + 1, 'i');
765       word.append("ty");
766       k = old_k;
767       /*
768        * the -ability and -ibility endings are highly productive, so just accept
769        * them
770        */
771       if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'l')) {
772         word.setLength(j - 1);
773         word.append("le"); /* convert to -ble */
774         k = j;
775         lookup();
776         return;
777       }
778       
779       /* ditto for -ivity */
780       if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'v')) {
781         word.setLength(j + 1);
782         word.unsafeWrite('e'); /* convert to -ive */
783         k = j + 1;
784         lookup();
785         return;
786       }
787       /* ditto for -ality */
788       if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) {
789         word.setLength(j + 1);
790         k = j;
791         lookup();
792         return;
793       }
794       
795       /*
796        * if the root isn't in the dictionary, and the variant *is* there, then
797        * use the variant. This allows `immunity'->`immune', but prevents
798        * `capacity'->`capac'. If neither the variant nor the root form are in
799        * the dictionary, then remove the ending as a default
800        */
801 
802       if (lookup()) return;
803       
804       /* the default is to remove -ity altogether */
805       word.setLength(j + 1);
806       k = j;
807       // nolookup(), we already did it.
808       return;
809     }
810   }
811   
812   /* handle -ence and -ance */
813   private void nceEndings() {
814     int old_k = k;
815     char word_char;
816     
817     if (endsIn('n', 'c', 'e')) {
818       word_char = word.charAt(j);
819       if (!((word_char == 'e') || (word_char == 'a'))) return;
820       word.setLength(j);
821       word.unsafeWrite('e'); /* try converting -e/ance to -e (adherance/adhere) */
822       k = j;
823       if (lookup()) return;
824       word.setLength(j); /*
825                           * try removing -e/ance altogether
826                           * (disappearance/disappear)
827                           */
828       k = j - 1;
829       if (lookup()) return;
830       word.unsafeWrite(word_char); /* restore the original ending */
831       word.append("nce");
832       k = old_k;
833       // nolookup() because we restored the original ending
834     }
835     return;
836   }
837   
838   /* handle -ness */
839   private void nessEndings() {
840     if (endsIn('n', 'e', 's', 's')) { /*
841                                        * this is a very productive endings, so
842                                        * just accept it
843                                        */
844       word.setLength(j + 1);
845       k = j;
846       if (word.charAt(j) == 'i') word.setCharAt(j, 'y');
847       lookup();
848     }
849     return;
850   }
851   
852   /* handle -ism */
853   private void ismEndings() {
854     if (endsIn('i', 's', 'm')) { /*
855                                   * this is a very productive ending, so just
856                                   * accept it
857                                   */
858       word.setLength(j + 1);
859       k = j;
860       lookup();
861     }
862     return;
863   }
864   
865   /* this routine deals with -ment endings. */
866   private void mentEndings() {
867     int old_k = k;
868     
869     if (endsIn('m', 'e', 'n', 't')) {
870       word.setLength(j + 1);
871       k = j;
872       if (lookup()) return;
873       word.append("ment");
874       k = old_k;
875       // nolookup
876     }
877     return;
878   }
879   
880   /* this routine deals with -ize endings. */
881   private void izeEndings() {
882     int old_k = k;
883     
884     if (endsIn('i', 'z', 'e')) {
885       word.setLength(j + 1); /* try removing -ize entirely */
886       k = j;
887       if (lookup()) return;
888       word.unsafeWrite('i');
889       
890       if (doubleC(j)) { /* allow for a doubled consonant */
891         word.setLength(j);
892         k = j - 1;
893         if (lookup()) return;
894         word.unsafeWrite(word.charAt(j - 1));
895       }
896       
897       word.setLength(j + 1);
898       word.unsafeWrite('e'); /* try removing -ize and adding -e */
899       k = j + 1;
900       if (lookup()) return;
901       word.setLength(j + 1);
902       word.append("ize");
903       k = old_k;
904       // nolookup()
905     }
906     return;
907   }
908   
909   /* handle -ency and -ancy */
910   private void ncyEndings() {
911     if (endsIn('n', 'c', 'y')) {
912       if (!((word.charAt(j) == 'e') || (word.charAt(j) == 'a'))) return;
913       word.setCharAt(j + 2, 't'); /* try converting -ncy to -nt */
914       word.setLength(j + 3);
915       k = j + 2;
916       
917       if (lookup()) return;
918       
919       word.setCharAt(j + 2, 'c'); /* the default is to convert it to -nce */
920       word.unsafeWrite('e');
921       k = j + 3;
922       lookup();
923     }
924     return;
925   }
926   
927   /* handle -able and -ible */
928   private void bleEndings() {
929     int old_k = k;
930     char word_char;
931     
932     if (endsIn('b', 'l', 'e')) {
933       if (!((word.charAt(j) == 'a') || (word.charAt(j) == 'i'))) return;
934       word_char = word.charAt(j);
935       word.setLength(j); /* try just removing the ending */
936       k = j - 1;
937       if (lookup()) return;
938       if (doubleC(k)) { /* allow for a doubled consonant */
939         word.setLength(k);
940         k--;
941         if (lookup()) return;
942         k++;
943         word.unsafeWrite(word.charAt(k - 1));
944       }
945       word.setLength(j);
946       word.unsafeWrite('e'); /* try removing -a/ible and adding -e */
947       k = j;
948       if (lookup()) return;
949       word.setLength(j);
950       word.append("ate"); /* try removing -able and adding -ate */
951       /* (e.g., compensable/compensate) */
952       k = j + 2;
953       if (lookup()) return;
954       word.setLength(j);
955       word.unsafeWrite(word_char); /* restore the original values */
956       word.append("ble");
957       k = old_k;
958       // nolookup()
959     }
960     return;
961   }
962   
963   /*
964    * handle -ic endings. This is fairly straightforward, but this is also the
965    * only place we try *expanding* an ending, -ic -> -ical. This is to handle
966    * cases like `canonic' -> `canonical'
967    */
968   private void icEndings() {
969     if (endsIn('i', 'c')) {
970       word.setLength(j + 3);
971       word.append("al"); /* try converting -ic to -ical */
972       k = j + 4;
973       if (lookup()) return;
974       
975       word.setCharAt(j + 1, 'y'); /* try converting -ic to -y */
976       word.setLength(j + 2);
977       k = j + 1;
978       if (lookup()) return;
979       
980       word.setCharAt(j + 1, 'e'); /* try converting -ic to -e */
981       if (lookup()) return;
982       
983       word.setLength(j + 1); /* try removing -ic altogether */
984       k = j;
985       if (lookup()) return;
986       word.append("ic"); /* restore the original ending */
987       k = j + 2;
988       // nolookup()
989     }
990     return;
991   }
992   
993   private static char[] ization = "ization".toCharArray();
994   private static char[] ition = "ition".toCharArray();
995   private static char[] ation = "ation".toCharArray();
996   private static char[] ication = "ication".toCharArray();
997   
998   /* handle some derivational endings */
999   /*
1000    * this routine deals with -ion, -ition, -ation, -ization, and -ication. The
1001    * -ization ending is always converted to -ize
1002    */
  private void ionEndings() {
    // index of the last character on entry; used to restore k when a
    // candidate is rejected
    int old_k = k;
    if (!endsIn('i', 'o', 'n')) {
      return;
    }
    
    /*
     * the -ize ending is very productive, so simply accept it as the root:
     * keep the leading "iz" of the suffix and add `e'. The result of the
     * lookup is not used to decide anything here.
     */
    if (endsIn(ization)) {
      word.setLength(j + 3);
      word.unsafeWrite('e');
      k = j + 3;
      lookup();
      return;
    }
    
    if (endsIn(ition)) {
      /*
       * remove -ition and add `e', and check against the dictionary
       * (e.g., definition->define, opposition->oppose)
       */
      word.setLength(j + 1);
      word.unsafeWrite('e');
      k = j + 1;
      if (lookup())
      return;
      
      /* restore original values */
      word.setLength(j + 1);
      word.append("ition");
      k = old_k;
      // no lookup for the restored word
    } else if (endsIn(ation)) {
      /*
       * remove -ion and add `e' (i.e. -ation -> -ate), and check against the
       * dictionary (elimination -> eliminate)
       */
      word.setLength(j + 3);
      word.unsafeWrite('e');
      k = j + 3;
      if (lookup())
      return;
      
      /* remove -ation and add `e', and check against the dictionary */
      word.setLength(j + 1);
      word.unsafeWrite('e');
      k = j + 1;
      if (lookup()) return;
      
      /* just remove -ation (resignation->resign) and check dictionary */
      word.setLength(j + 1);
      k = j;
      if (lookup()) return;
      
      /* restore original values */
      word.setLength(j + 1);
      word.append("ation");
      k = old_k;
      // no lookup for the restored word
      
    }
    
    /*
     * test -ication after -ation is attempted (e.g., `complication->complicate'
     * rather than `complication->comply'). This point is only reached with
     * the original word back in the buffer.
     */

    if (endsIn(ication)) {
      /*
       * remove -ication and add `y', and check against the dictionary
       * (e.g., amplification -> amplify)
       */
      word.setLength(j + 1);
      word.unsafeWrite('y');
      k = j + 1;
      if (lookup())
      return;
      
      /* restore original values */
      word.setLength(j + 1);
      word.append("ication");
      k = old_k;
      // no lookup for the restored word
    }
    
    /*
     * Plain -ion. The suffix was already confirmed by the endsIn('i', 'o', 'n')
     * test at the top, so instead of calling endsIn again only j is
     * recomputed (the endsIn calls above may have moved it): k - 3 is the
     * index of the character just before "ion".
     */
    if (true) {
      j = k - 3; // YCS
      
      /* remove -ion and add `e', and check against the dictionary */
      word.setLength(j + 1);
      word.unsafeWrite('e');
      k = j + 1;
      if (lookup())
      return;
      
      /* remove -ion, and if it's found, treat that as the root */
      word.setLength(j + 1);
      k = j;
      if (lookup())
      return;
      
      /* restore original values */
      word.setLength(j + 1);
      word.append("ion");
      k = old_k;
      // no lookup for the restored word
    }
    
    // no lookup here: all of the other paths restored original values
    return;
  }
1112   
1113   /*
1114    * this routine deals with -er, -or, -ier, and -eer. The -izer ending is
1115    * always converted to -ize
1116    */
  private void erAndOrEndings() {
    // index of the last character on entry; used to restore k at the end
    int old_k = k;
    
    // quick rejection: every ending handled here finishes with 'r'
    if (word.charAt(k) != 'r') return; // YCS
    
    char word_char; /* so we can remember if it was -er or -or */
    
    /*
     * -ize is very productive, so accept it as the root: only the final 'r'
     * is cut off. The result of the lookup is not used to decide anything.
     */
    if (endsIn('i', 'z', 'e', 'r')) {
      word.setLength(j + 4);
      k = j + 3;
      lookup();
      return;
    }
    
    if (endsIn('e', 'r') || endsIn('o', 'r')) {
      // the vowel of the ending ('e' or 'o'), needed for the final restore
      word_char = word.charAt(j + 1);
      // NOTE(review): doubleC is defined elsewhere; presumably it reports a
      // doubled consonant ending at index j -- confirm. If so, try the root
      // with the second consonant dropped as well.
      if (doubleC(j)) {
        word.setLength(j);
        k = j - 1;
        if (lookup()) return;
        word.unsafeWrite(word.charAt(j - 1)); /* restore the doubled consonant */
      }
      
      if (word.charAt(j) == 'i') { /* do we have a -ier ending? */
        // replace the 'i' by 'y' and drop the rest of the ending
        word.setCharAt(j, 'y');
        word.setLength(j + 1);
        k = j;
        if (lookup()) /* yes, so check against the dictionary */
        return;
        word.setCharAt(j, 'i'); /* restore the endings */
        word.unsafeWrite('e');
      }
      
      if (word.charAt(j) == 'e') { /* handle -eer */
        // try the root without either 'e'
        word.setLength(j);
        k = j - 1;
        if (lookup()) return;
        word.unsafeWrite('e');
      }
      
      word.setLength(j + 2); /* remove the -r ending */
      k = j + 1;
      if (lookup()) return;
      word.setLength(j + 1); /* try removing -er/-or */
      k = j;
      if (lookup()) return;
      word.unsafeWrite('e'); /* try removing -or and adding -e */
      k = j + 1;
      if (lookup()) return;
      // nothing matched: rebuild root + remembered vowel + 'r'
      word.setLength(j + 1);
      word.unsafeWrite(word_char);
      word.unsafeWrite('r'); /* restore the word to the way it was */
      k = old_k;
      // no lookup for the restored word
    }
    
  }
1177   
1178   /*
   * this routine deals with -ly endings. The -ally ending is always converted
   * to -al. Sometimes this will temporarily leave us with a non-word (e.g.,
   * heuristically maps to heuristical), but then the -al is removed in the next
   * step.
   */
  private void lyEndings() {
    // index of the last character on entry; used when "ly" is put back
    int old_k = k;
    
    if (endsIn('l', 'y')) {
      
      word.setCharAt(j + 2, 'e'); /* try converting -ly to -le */
      
      if (lookup()) return;
      word.setCharAt(j + 2, 'y'); /* not found: undo the -le attempt */
      
      word.setLength(j + 1); /* try just removing the -ly */
      k = j;
      
      if (lookup()) return;
      
      /*
       * always convert -ally to -al: the buffer already has -ly removed at
       * this point, so it is simply left as it is
       */
      if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l'))
      return;
      // put "ly" back before examining the remaining special cases
      word.append("ly");
      k = old_k;
      
      /*
       * always convert -ably to -able: the final 'y' is overwritten with
       * 'e'. No lookup is made for the result.
       */
      if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'b')) {
        word.setCharAt(j + 2, 'e');
        k = j + 2;
        return;
      }
      
      if (word.charAt(j) == 'i') { /* e.g., militarily -> military */
        // replace -ily by -y and check against the dictionary
        word.setLength(j);
        word.unsafeWrite('y');
        k = j;
        if (lookup()) return;
        // not found: rebuild -ily
        word.setLength(j);
        word.append("ily");
        k = old_k;
      }
      
      word.setLength(j + 1); /* the default is to remove -ly */
      
      k = j;
      // no lookup here: removing "ly" was already tried above
    }
    return;
  }
1243   
1244   /*
1245    * this routine deals with -al endings. Some of the endings from the previous
1246    * routine are finished up here.
1247    */
  private void alEndings() {
    // index of the last character on entry; used when the ending is restored
    int old_k = k;
    
    // words shorter than four characters are left alone
    if (word.length() < 4) return;
    if (endsIn('a', 'l')) {
      word.setLength(j + 1);
      k = j;
      if (lookup()) /* try just removing the -al */
      return;
      
      if (doubleC(j)) { /* allow for a doubled consonant */
        // try the root with its last character dropped as well
        word.setLength(j);
        k = j - 1;
        if (lookup()) return;
        // not found: put the dropped consonant back (a copy of the one before it)
        word.unsafeWrite(word.charAt(j - 1));
      }
      
      word.setLength(j + 1);
      word.unsafeWrite('e'); /* try removing the -al and adding -e */
      k = j + 1;
      if (lookup()) return;
      
      word.setLength(j + 1);
      word.append("um"); /* try converting -al to -um */
      /* (e.g., optimal -> optimum) */
      k = j + 2;
      if (lookup()) return;
      
      word.setLength(j + 1);
      word.append("al"); /* restore the ending to the way it was */
      k = old_k;
      
      // -ical endings
      if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'c')) {
        word.setLength(j - 1); /* try removing -ical */
        k = j - 2;
        if (lookup()) return;
        
        word.setLength(j - 1);
        word.unsafeWrite('y');/* try turning -ical to -y (e.g., bibliographical) */
        k = j - 1;
        if (lookup()) return;
        
        word.setLength(j - 1);
        word.append("ic"); /* the default is to convert -ical to -ic */
        k = j;
        // The original note here argued that no lookup is needed, because
        // turning -ical into -ic is the same as removing "al", which was
        // already tried above; it was then marked "ERROR" and the lookup
        // is performed anyway. The -ic form is kept whatever the lookup
        // reports.
        lookup();
        return;
      }
      
      if (word.charAt(j) == 'i') { /* sometimes -ial endings should be removed */
        word.setLength(j); /* (sometimes it gets turned into -y, but we */
        k = j - 1; /* aren't dealing with that case for now) */
        if (lookup()) return;
        // not found: rebuild -ial and look up the restored word
        word.append("ial");
        k = old_k;
        lookup();
      }
      
    }
    return;
  }
1312   
1313   /*
1314    * this routine deals with -ive endings. It normalizes some of the -ative
1315    * endings directly, and also maps some -ive endings to -ion.
1316    */
1317   private void iveEndings() {
1318     int old_k = k;
1319     
1320     if (endsIn('i', 'v', 'e')) {
1321       word.setLength(j + 1); /* try removing -ive entirely */
1322       k = j;
1323       if (lookup()) return;
1324       
1325       word.unsafeWrite('e'); /* try removing -ive and adding -e */
1326       k = j + 1;
1327       if (lookup()) return;
1328       word.setLength(j + 1);
1329       word.append("ive");
1330       if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 't')) {
1331         word.setCharAt(j - 1, 'e'); /* try removing -ative and adding -e */
1332         word.setLength(j); /* (e.g., determinative -> determine) */
1333         k = j - 1;
1334         if (lookup()) return;
1335         word.setLength(j - 1); /* try just removing -ative */
1336         if (lookup()) return;
1337         
1338         word.append("ative");
1339         k = old_k;
1340       }
1341       
1342       /* try mapping -ive to -ion (e.g., injunctive/injunction) */
1343       word.setCharAt(j + 2, 'o');
1344       word.setCharAt(j + 3, 'n');
1345       if (lookup()) return;
1346       
1347       word.setCharAt(j + 2, 'v'); /* restore the original values */
1348       word.setCharAt(j + 3, 'e');
1349       k = old_k;
1350       // nolookup()
1351     }
1352     return;
1353   }
1354   
  /** Package-private no-argument constructor; its body is empty. */
  KStemmer() {}
1356   
1357   String stem(String term) {
1358     boolean changed = stem(term.toCharArray(), term.length());
1359     if (!changed) return term;
1360     return asString();
1361   }
1362   
1363   /**
1364    * Returns the result of the stem (assuming the word was changed) as a String.
1365    */
1366   String asString() {
1367     String s = getString();
1368     if (s != null) return s;
1369     return word.toString();
1370   }
1371   
1372   CharSequence asCharSequence() {
1373     return result != null ? result : word;
1374   }
1375   
1376   String getString() {
1377     return result;
1378   }
1379   
1380   char[] getChars() {
1381     return word.getArray();
1382   }
1383   
1384   int getLength() {
1385     return word.length();
1386   }
1387   
  /*
   * Root string taken from a matching dictionary entry. Reset to null at the
   * start of stem(char[], int); when it stays null the stem, if any, is
   * whatever the word buffer holds.
   */
  String result;
1389   
1390   private boolean matched() {
1391     /***
1392      * if (!lookups.contains(word.toString())) { throw new
1393      * RuntimeException("didn't look up "+word.toString()+" prev="+prevLookup);
1394      * }
1395      ***/
1396     // lookup();
1397     return matchedEntry != null;
1398   }
1399   
1400   /**
1401    * Stems the text in the token. Returns true if changed.
1402    */
  boolean stem(char[] term, int len) {
    
    // forget the root left over from any previous call
    result = null;
    
    // k is the index of the last character. Terms with len <= 2 or
    // len >= MaxWordLen are not stemmed.
    k = len - 1;
    if ((k <= 1) || (k >= MaxWordLen - 1)) {
      return false; // don't stem
    }
    
    // first check the stemmer dictionaries, and avoid using the
    // cache if it's in there. An entry with a root maps the term straight to
    // that root; an entry without one means the term is kept as it is.
    DictEntry entry = dict_ht.get(term, 0, len);
    if (entry != null) {
      if (entry.root != null) {
        result = entry.root;
        return true;
      }
      return false;
    }
    
    /***
     * caching off is normally faster if (cache == null) initializeStemHash();
     * 
     * // now check the cache, before we copy chars to "word" if (cache != null)
     * { String val = cache.get(term, 0, len); if (val != null) { if (val !=
     * SAME) { result = val; return true; } return false; } }
     ***/
    
    // copy the term into the working buffer
    word.reset();
    // allocate enough space so that an expansion is never needed
    word.reserve(len + 10);
    for (int i = 0; i < len; i++) {
      char ch = term[i];
      if (!isAlpha(ch)) return false; // don't stem
      // don't lowercase... it's a requirement that lowercase filter be
      // used before this stemmer.
      word.unsafeWrite(ch);
    }
    
    // no dictionary entry has been matched for the buffer yet
    matchedEntry = null;
    /***
     * lookups.clear(); lookups.add(word.toString());
     ***/
    
    /*
     * This while loop will never be executed more than one time; it is here
     * only to allow the break statement to be used to escape as soon as a word
     * is recognized. The rules run in a fixed order, each one working on
     * whatever the previous ones left in the buffer.
     */
    while (true) {
      // YCS: extra lookup()s were inserted so we don't need to
      // do an extra wordInDict() here.
      plural();
      if (matched()) break;
      pastTense();
      if (matched()) break;
      aspect();
      if (matched()) break;
      ityEndings();
      if (matched()) break;
      nessEndings();
      if (matched()) break;
      ionEndings();
      if (matched()) break;
      erAndOrEndings();
      if (matched()) break;
      lyEndings();
      if (matched()) break;
      alEndings();
      if (matched()) break;
      // NOTE(review): the value stored in entry is overwritten after the
      // loop before it is ever read; presumably the call is kept for a side
      // effect of wordInDict() (defined elsewhere) -- confirm before removing.
      entry = wordInDict();
      iveEndings();
      if (matched()) break;
      izeEndings();
      if (matched()) break;
      mentEndings();
      if (matched()) break;
      bleEndings();
      if (matched()) break;
      ismEndings();
      if (matched()) break;
      icEndings();
      if (matched()) break;
      ncyEndings();
      if (matched()) break;
      nceEndings();
      // the return value is discarded; the loop is left either way
      matched();
      break;
    }
    
    /*
     * try for a direct mapping (allows for cases like `Italian'->`Italy' and
     * `Italians'->`Italy')
     */
    entry = matchedEntry;
    if (entry != null) {
      result = entry.root; // may be null, which means that "word" is the stem
    }
    
    /***
     * caching off is normally faster if (cache != null && cache.size() <
     * maxCacheSize) { char[] key = new char[len]; System.arraycopy(term, 0,
     * key, 0, len); if (result != null) { cache.put(key, result); } else {
     * cache.put(key, word.toString()); } }
     ***/
    
    /***
     * if (entry == null) { if (!word.toString().equals(new String(term,0,len)))
     * { System.out.println("CASE:" + word.toString() + "," + new
     * String(term,0,len));
     * 
     * } }
     ***/
    
    // no entry matched means result is "word"; true is returned whether or
    // not any rule actually modified the buffer
    return true;
  }
1520   
1521 }